<!DOCTYPE html>
<html lang="zh-CN">
<head><meta name="generator" content="Hexo 3.9.0">
  <meta charset="utf-8">
  
  <title>【nlp】相似度检索-bm25 | MaxMa</title>
  <meta name="viewport" content="width=device-width, initial-scale=1">
  
  <meta name="keywords" content="BM25">
  
  
  
  
  <meta name="description" content="概述BM25算法，通常用来作搜索相关性评分。一句话概括其主要思想：对Query进行语素解析，生成语素；然后，对于候选集D中的每个d，计算每个语素qi与d的相关性得分，最后，将qi相对于d的相关性得分进行加权求和，从而得到Query与d的相关性得分。 BM25一般公式 Score(Q,d)=\sum^{n}_{i} W_{i} \cdot R(q_{i},d)其中，$Q$表示Query，$q_i$表">
<meta name="keywords" content="BM25">
<meta property="og:type" content="article">
<meta property="og:title" content="【NLP】相似度检索-BM25">
<meta property="og:url" content="https://anxiang1836.github.io/2020/01/07/NLP_BM25/index.html">
<meta property="og:site_name" content="MaxMa">
<meta property="og:description" content="概述BM25算法，通常用来作搜索相关性评分。一句话概括其主要思想：对Query进行语素解析，生成语素；然后，对于候选集D中的每个d，计算每个语素qi与d的相关性得分，最后，将qi相对于d的相关性得分进行加权求和，从而得到Query与d的相关性得分。 BM25一般公式 Score(Q,d)=\sum^{n}_{i} W_{i} \cdot R(q_{i},d)其中，$Q$表示Query，$q_i$表">
<meta property="og:locale" content="zh-CN">
<meta property="og:updated_time" content="2020-01-07T08:44:52.492Z">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="【NLP】相似度检索-BM25">
<meta name="twitter:description" content="概述BM25算法，通常用来作搜索相关性评分。一句话概括其主要思想：对Query进行语素解析，生成语素；然后，对于候选集D中的每个d，计算每个语素qi与d的相关性得分，最后，将qi相对于d的相关性得分进行加权求和，从而得到Query与d的相关性得分。 BM25一般公式 Score(Q,d)=\sum^{n}_{i} W_{i} \cdot R(q_{i},d)其中，$Q$表示Query，$q_i$表">
  
    <link rel="alternate" href="/atom.xml" title="MaxMa" type="application/atom+xml">
  

  

  <link rel="icon" href="/css/images/mylogo.jpg">
  <link rel="apple-touch-icon" href="/css/images/mylogo.jpg">
  
    <link href="//fonts.googleapis.com/css?family=Source+Code+Pro" rel="stylesheet" type="text/css">
  
  <link href="https://fonts.googleapis.com/css?family=Open+Sans|Montserrat:700" rel="stylesheet" type="text/css">
  <link href="https://fonts.googleapis.com/css?family=Roboto:400,300,300italic,400italic" rel="stylesheet" type="text/css">
  <link href="//netdna.bootstrapcdn.com/font-awesome/4.0.3/css/font-awesome.css" rel="stylesheet">
  <style type="text/css">
    /*
     * Self-hosted Futura PT faces used by the theme.
     * The files are OpenType (.otf), so the format() hint is "opentype".
     * NOTE(review): these URLs are page-relative, while the other assets in
     * this page are referenced root-relative ("/css/...") - confirm the fonts
     * actually resolve from nested post URLs.
     */
    @font-face {
      font-family: futura-pt;
      src: url("css/fonts/FuturaPTBold.otf") format("opentype");
      font-weight: 500;
      font-style: normal;
    }
    @font-face {
      font-family: futura-pt-light;
      src: url("css/fonts/FuturaPTBook.otf") format("opentype");
      font-weight: lighter;
      font-style: normal;
    }
    @font-face {
      font-family: futura-pt-italic;
      src: url("css/fonts/FuturaPTBookOblique.otf") format("opentype");
      font-weight: 400;
      font-style: italic;
    }
  </style>
  <link rel="stylesheet" href="/css/style.css">

  <script src="/js/jquery-3.1.1.min.js"></script>
  <script src="/js/bootstrap.js"></script>

  <!-- Bootstrap core CSS -->
  <link rel="stylesheet" href="/css/bootstrap.css">

  
    <link rel="stylesheet" href="/css/dialog.css">
  

  

  
    <link rel="stylesheet" href="/css/header-post.css">
  

  
  
  

</head>


  <body data-spy="scroll" data-target="#toc" data-offset="50">


  
  <div id="container">
    <div id="wrap">
      
        <header>

    <div id="allheader" class="navbar navbar-default navbar-static-top" role="navigation">
        <div class="navbar-inner">
          
          <div class="container"> 
            <button type="button" class="navbar-toggle" data-toggle="collapse" data-target=".navbar-collapse">
              <span class="sr-only">Toggle navigation</span>
              <span class="icon-bar"></span>
              <span class="icon-bar"></span>
              <span class="icon-bar"></span>
            </button>

            
              <a class="brand" style="
                 margin-top: 0px;"  
                href="#" data-toggle="modal" data-target="#myModal" >
                  <!-- width/height take unitless pixel integers; alt names the site whose logo this is -->
                  <img width="124" height="124" alt="MaxMa" src="/css/images/mylogo.jpg">
              </a>
            
            
            <div class="navbar-collapse collapse">
              <ul class="hnav navbar-nav">
                
                  <li> <a class="main-nav-link" href="/">首页</a> </li>
                
                  <li> <a class="main-nav-link" href="/archives">归档</a> </li>
                
                  <li> <a class="main-nav-link" href="/categories">分类</a> </li>
                
                  <li> <a class="main-nav-link" href="/tags">标签</a> </li>
                
                  <li> <a class="main-nav-link" href="/about">关于</a> </li>
                
                  <li><div id="search-form-wrap">

    <!-- Inline search box. Neither control has visible text, so aria-label supplies the accessible name. -->
    <form class="search-form">
        <input type="text" class="ins-search-input search-form-input" placeholder="" aria-label="搜索" />
        <button type="submit" class="search-form-submit" aria-label="搜索"></button>
    </form>
    <div class="ins-search">
    <div class="ins-search-mask"></div>
    <div class="ins-search-container">
        <div class="ins-input-wrapper">
            <input type="text" class="ins-search-input" placeholder="请输入关键词..." />
            <span class="ins-close ins-selectable"><i class="fa fa-times-circle"></i></span>
        </div>
        <div class="ins-section-wrapper">
            <div class="ins-section-container"></div>
        </div>
    </div>
</div>
<script>
(function (window) {
    var INSIGHT_CONFIG = {
        TRANSLATION: {
            POSTS: '文章',
            PAGES: '页面',
            CATEGORIES: '分类',
            TAGS: '标签',
            UNTITLED: '(无标题)',
        },
        ROOT_URL: '/',
        CONTENT_URL: '/content.json',
    };
    window.INSIGHT_CONFIG = INSIGHT_CONFIG;
})(window);
</script>
<script src="/js/insight.js"></script>

</div></li>
              </ul>
            </div>
          </div>
                
      </div>
    </div>

</header>



      
            
      <div id="content" class="outer">
        
          <section id="main" style="float:none;"><article id="post-NLP_BM25" style="width: 75%; float:left;" class="article article-type-post" itemscope itemprop="blogPost" >
  <div id="articleInner" class="article-inner">
    
    
      <header class="article-header">
        
  
    <h1 class="thumb" itemprop="name">
      【NLP】相似度检索-BM25
    </h1>
  

      </header>
    
    <div class="article-meta">
      
	<a href="/2020/01/07/NLP_BM25/" class="article-date">
	  <time datetime="2020-01-07T02:39:58.970Z" itemprop="datePublished">2020-01-07</time>
	</a>

      
    <a class="article-category-link" href="/categories/NLP/">NLP</a>

      
	<a class="article-views">
	<span id="busuanzi_container_page_pv">
		阅读量<span id="busuanzi_value_page_pv"></span>
	</span>
	</a>

      

    </div>
    <div class="article-entry" itemprop="articleBody">
      
        <h2 id="概述"><a href="#概述" class="headerlink" title="概述"></a>概述</h2><p>BM25算法，通常用来作搜索相关性评分。一句话概括其主要思想：对Query进行语素解析，生成语素；然后，对于候选集D中的每个d，计算每个语素qi与d的相关性得分，最后，将qi相对于d的相关性得分进行加权求和，从而得到Query与d的相关性得分。</p>
<h2 id="BM25一般公式"><a href="#BM25一般公式" class="headerlink" title="BM25一般公式"></a>BM25一般公式</h2><script type="math/tex; mode=display">
Score(Q,d)=\sum^{n}_{i} W_{i} \cdot R(q_{i},d)</script><p>其中，$Q$表示Query，$q_i$表示$Q$解析之后的一个语素；</p>
<blockquote>
<p>可以理解为对于$Q$进行分词后去停用词的内容List</p>
</blockquote>
<p>$d$表示候选文档；</p>
<p>$W_i$表示$Q$中的语素$q_i$的权重；$R(q_i,d)$表示语素$q_i$与$d$的相关性得分</p>
<h2 id="BM25进一步理解"><a href="#BM25进一步理解" class="headerlink" title="BM25进一步理解"></a>BM25进一步理解</h2><h3 id="W-i-的计算方式"><a href="#W-i-的计算方式" class="headerlink" title="$W_i$的计算方式"></a>$W_i$的计算方式</h3><p>在上文已经提及到了，$W_i$是表示一个词与一个文档的相关性的权重，较为常用的是IDF。IDF的公式如下：</p>
<script type="math/tex; mode=display">
IDF(q_i)=log\frac{N-n(q_i)+0.5}{n(q_i)+0.5}</script><p>其中，$N$为候选集文章集合中的文章的数量；$n(q_i)$表示包含$q_i$的文档数。</p>
<blockquote>
<p>根据IDF的定义可以看出，对于给定候选文章集合，包含了$q_i$的文档数越多，$q_i$的权重则越低。这个从熵的角度理解，如果一个词语基本在大部分文章都出现，那么该词对于文章的区分度贡献就趋近于0，进而用于计算$q_i$和$d$的相关性时的权重当然就较低了。</p>
</blockquote>
<h3 id="R-q-i-d-的计算方式"><a href="#R-q-i-d-的计算方式" class="headerlink" title="$R(q_i,d)$的计算方式"></a>$R(q_i,d)$的计算方式</h3><p>相关性得分的一般形式为：</p>
<script type="math/tex; mode=display">
R(q_i,d)=\frac{f_{i}\cdot(k_{1}+1)}{f_{i}+K}\cdot 
    \frac{qf_{i}\cdot (k_2+1)}{qf_i+k_2}</script><script type="math/tex; mode=display">
K=k_1\cdot(1-b+b \cdot \frac{len(d)}{avg\_len(D)})</script><p>其中，$k_1, k_2, b$为调节因子，通常根据经验设置，$k_1=2,k_2=1,b=0.75$；</p>
<p>$f_i$为$q_i$在$d$中出现的频率；$qf_i$为$q_i$在Query中出现的频率。</p>
<p>$len(d)$为文档$d$的长度；$avg\_len(D)$为$D$中所有文档的平均长度。</p>
<blockquote>
<p>参数b的意义：调整文章长度对于相关性的影响的大小。b越大，文档的长度对于相关性的影响越大</p>
</blockquote>

      
    </div>
    <footer class="article-footer">
      
      
      
        
	<div id="comment">
		<!-- 来必力City版安装代码 -->
		<div id="lv-container" data-id="city" data-uid="MTAyMC80NTk2OS8yMjQ4MA==">
		<script type="text/javascript">
		   (function(d, s) {
		       var j, e = d.getElementsByTagName(s)[0];

		       if (typeof LivereTower === 'function') { return; }

		       j = d.createElement(s);
		       j.src = 'https://cdn-city.livere.com/js/embed.dist.js';
		       j.async = true;

		       e.parentNode.insertBefore(j, e);
		   })(document, 'script');
		</script>
		<noscript>为正常使用来必力评论功能请激活JavaScript</noscript>
		</div>
		<!-- City版安装代码已完成 -->
	</div>



      
      
        
  <ul class="article-tag-list"><li class="article-tag-list-item"><a class="article-tag-list-link" href="/tags/BM25/">BM25</a></li></ul>

      

    </footer>
  </div>
  
    
<nav id="article-nav">
  
  
    <a href="/2019/11/05/NLP_From_HMM_to_CRF/" id="article-nav-older" class="article-nav-link-wrap">
      <strong class="article-nav-caption">下一篇</strong>
      <div class="article-nav-title">【NLP】从隐马尔科夫到条件随机场</div>
    </a>
  
</nav>

  
</article>

<!-- Table of Contents -->

  <aside id="toc-sidebar">
    <div id="toc" class="toc-article">
    <strong class="toc-title">文章目录</strong>
    
        <ol class="nav"><li class="nav-item nav-level-2"><a class="nav-link" href="#概述"><span class="nav-number">1.</span> <span class="nav-text">概述</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#BM25一般公式"><span class="nav-number">2.</span> <span class="nav-text">BM25一般公式</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#BM25进一步理解"><span class="nav-number">3.</span> <span class="nav-text">BM25进一步理解</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#W-i-的计算方式"><span class="nav-number">3.1.</span> <span class="nav-text">$W_i$的计算方式</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#R-q-i-d-的计算方式"><span class="nav-number">3.2.</span> <span class="nav-text">$R(q_i,d)$的计算方式</span></a></li></ol></li></ol>
    
    </div>
  </aside>

</section>
        
      </div>
      
      <footer id="footer">
  

  <div class="container">
      	<div class="row">
	      <p> Powered by <a href="https://hexo.io/" target="_blank" rel="noopener noreferrer">Hexo</a> and <a href="https://github.com/iTimeTraveler/hexo-theme-hiker" target="_blank" rel="noopener noreferrer">Hexo-theme-hiker</a> </p>
	      <p id="copyRightEn">Copyright &copy; 2013 - 2020 MaxMa All Rights Reserved.</p>
	      
	      
    		<p class="busuanzi_uv">
				访客数 : <span id="busuanzi_value_site_uv"></span> |  
				访问量 : <span id="busuanzi_value_site_pv"></span>
		    </p>
  		   
		</div>

		
  </div>
</footer>


<!-- min height -->

<script>
    // Give #wrap and #content a min-height derived from the rendered height of <body>.
    // These are top-level `var` declarations in a classic script, so they become globals.
    var wrapdiv = document.getElementById("wrap");
    var contentdiv = document.getElementById("content");
    var allheader = document.getElementById("allheader");

    // #wrap is set first; body.offsetHeight is read again below, after this write.
    wrapdiv.style.minHeight = document.body.offsetHeight + "px";
    if (allheader != null) {
      // Header present: #content gets the body height minus the header and footer heights.
      contentdiv.style.minHeight = document.body.offsetHeight - allheader.offsetHeight - document.getElementById("footer").offsetHeight + "px";
    } else {
      // No #allheader element: only the footer height is subtracted.
      contentdiv.style.minHeight = document.body.offsetHeight - document.getElementById("footer").offsetHeight + "px";
    }
</script>
    </div>
    <!-- <nav id="mobile-nav">
  
    <a href="/" class="mobile-nav-link">Home</a>
  
    <a href="/archives" class="mobile-nav-link">Archives</a>
  
    <a href="/categories" class="mobile-nav-link">Categories</a>
  
    <a href="/tags" class="mobile-nav-link">Tags</a>
  
    <a href="/about" class="mobile-nav-link">About</a>
  
</nav> -->
    

<!-- mathjax config similar to math.stackexchange -->

<script type="text/x-mathjax-config">
  MathJax.Hub.Config({
    tex2jax: {
      inlineMath: [ ['$','$'], ["\\(","\\)"] ],
      processEscapes: true
    }
  });
</script>

<script type="text/x-mathjax-config">
    MathJax.Hub.Config({
      tex2jax: {
        skipTags: ['script', 'noscript', 'style', 'textarea', 'pre', 'code']
      }
    });
</script>

<script type="text/x-mathjax-config">
    MathJax.Hub.Queue(function() {
        var all = MathJax.Hub.getAllJax(), i;
        for(i=0; i < all.length; i += 1) {
            all[i].SourceElement().parentNode.className += ' has-jax';
        }
    });
</script>

<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
</script>


  <link rel="stylesheet" href="/fancybox/jquery.fancybox.css">
  <script src="/fancybox/jquery.fancybox.pack.js"></script>


<script src="/js/scripts.js"></script>




  <script src="/js/dialog.js"></script>








	<div style="display: none;">
    <script src="https://s95.cnzz.com/z_stat.php?id=1260716016&web_id=1260716016" language="JavaScript"></script>
  </div>



	<script async src="//busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js">
	</script>






  </div>

  <div class="modal fade" id="myModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel" aria-hidden="true" style="display: none;">
  <div class="modal-dialog">
    <div class="modal-content">
      <div class="modal-header">
        <h2 class="modal-title" id="myModalLabel">设置</h2>
      </div>
      <hr style="margin-top:0px; margin-bottom:0px; width:80%; border-top: 3px solid #000;">
      <hr style="margin-top:2px; margin-bottom:0px; width:80%; border-top: 1px solid #000;">


      <div class="modal-body">
          <div style="margin:6px;">
            <a data-toggle="collapse" data-parent="#accordion" href="#collapseOne" onclick="javascript:setFontSize();" aria-expanded="true" aria-controls="collapseOne">
              正文字号大小
            </a>
          </div>
          <div id="collapseOne" class="panel-collapse collapse" role="tabpanel" aria-labelledby="headingOne">
          <div class="panel-body">
            您已调整页面字体大小
          </div>
        </div>
      


          <div style="margin:6px;">
            <a data-toggle="collapse" data-parent="#accordion" href="#collapseTwo" onclick="javascript:setBackground();" aria-expanded="true" aria-controls="collapseTwo">
              夜间护眼模式
            </a>
        </div>
          <div id="collapseTwo" class="panel-collapse collapse" role="tabpanel" aria-labelledby="headingTwo">
          <div class="panel-body">
            夜间模式已经开启，再次单击按钮即可关闭 
          </div>
        </div>

        <div>
            <a data-toggle="collapse" data-parent="#accordion" href="#collapseThree" aria-expanded="true" aria-controls="collapseThree">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;关 于&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</a>
        </div>
         <div id="collapseThree" class="panel-collapse collapse" role="tabpanel" aria-labelledby="headingThree">
          <div class="panel-body">
            MaxMa
          </div>
          <div class="panel-body">
            Copyright © 2020 MaxMa All Rights Reserved.
          </div>
        </div>
      </div>


      <hr style="margin-top:0px; margin-bottom:0px; width:80%; border-top: 1px solid #000;">
      <hr style="margin-top:2px; margin-bottom:0px; width:80%; border-top: 3px solid #000;">
      <div class="modal-footer">
        <button type="button" class="close" data-dismiss="modal" aria-label="Close"><span aria-hidden="true">×</span></button>
      </div>
    </div>
  </div>
</div>
  
  <a id="rocket" href="#top" class=""></a>
  <script type="text/javascript" src="/js/totop.js?v=1.0.0" async=""></script>
  
    <a id="menu-switch"><i class="fa fa-bars fa-lg"></i></a>
  
<!--
  MathJax is configured and loaded exactly once, by the text/x-mathjax-config
  blocks and the cdnjs MathJax.js script placed before the fancybox assets.
  Do not add a second configuration or loader here: a repeated config appends
  the "has-jax" class twice, and the old http://cdn.mathjax.org loader is a
  retired CDN that browsers block as mixed content on HTTPS pages.
-->
</body>
</html>